## pval_cutoff: 0.05
## lfc_cutoff: 0.585
## low_counts_cutoff: 10

General statistics

# Number of samples (one column per sample).
# ncol() is used rather than length(): length() only gives the column count
# because of data-frame list semantics and would return rows * cols on a matrix.
ncol(counts_data)
## [1] 6
# Number of genes (one row per gene)
nrow(counts_data)
## [1] 43432
# Total counts per sample
colSums(counts_data)
## SRR15202006 SRR15202007 SRR15202008 SRR15202009 SRR15202010 SRR15202011 
##    12633373    11169811    11283591    10889859    11241553     8610959

Create DDS objects

# Create DESeqDataSet object from the raw counts with design `~ treatment`.
# get_DESeqDataSet_obj() is a project helper defined outside this report. Per
# the output recorded below it prints two checks and the object size before and
# after what looks like a gene filter (43432 -> 17151 rows) — presumably driven
# by low_counts_cutoff; TODO confirm against the helper.
dds <- get_DESeqDataSet_obj(counts_data, ~ treatment)
## [1] TRUE
## [1] TRUE
## [1] "DESeqDataSet object of length 43432 with 0 metadata columns"
## [1] "DESeqDataSet object of length 17151 with 1 metadata column"
# Per-sample metadata attached to the dataset
colData(dds)
## DataFrame with 6 rows and 2 columns
##             treatment       label
##              <factor> <character>
## SRR15202006  control      control
## SRR15202007  control      control
## SRR15202008  control      control
## SRR15202009  diabetes    diabetes
## SRR15202010  diabetes    diabetes
## SRR15202011  diabetes    diabetes

Sample-to-sample comparisons

# Transform data (blinded rlog).
# get_transformed_data() is a project helper defined elsewhere; its result is
# used below through `rld$pca` (passed to summary() and plotted) and
# `rld$matrix` (passed to cor()).
rld <- get_transformed_data(dds)

PCA plot

# PCA fit carried inside the transformed-data object
pca <- rld$pca
# One row per sample: colData columns, the sample ID as `name`, then PC scores
pca_df <- as.data.frame(colData(dds)) %>%
  rownames_to_column(var = 'name') %>%
  cbind(pca$x)
# Variance explained by each component
summary(pca)
## Importance of components:
##                           PC1    PC2    PC3    PC4     PC5       PC6
## Standard deviation     7.7864 6.3073 4.4317 3.8287 3.39901 5.433e-14
## Proportion of Variance 0.4145 0.2720 0.1343 0.1002 0.07899 0.000e+00
## Cumulative Proportion  0.4145 0.6865 0.8208 0.9210 1.00000 1.000e+00
# PC1 vs PC2 scatter coloured by group label. Sample IDs are drawn slightly
# below each point; the x axis gets extra padding (presumably so labels near
# the edges are not clipped).
ggplot(pca_df, aes(x = PC1, y = PC2, color = label)) +
  geom_point() +
  geom_text(
    aes(label = name),
    position = position_nudge(y = -2),
    show.legend = FALSE, # spelled out: `F` is an ordinary, reassignable variable
    size = 3
  ) +
  scale_color_manual(values = colors_default) +
  scale_x_continuous(expand = c(0.2, 0))

Correlation heatmap

# Heatmap of pairwise sample correlations computed on the transformed matrix,
# with the group label shown as a column annotation
pheatmap(
  cor(rld$matrix),
  color = brewer.pal(8, 'YlOrRd'),
  annotation_col = select(as.data.frame(colData(dds)), label)
)

Wald test results

# DE analysis using Wald test: run the DESeq() pipeline on the filtered dataset
dds_full <- DESeq(dds)
# Sample metadata after fitting; the recorded output shows an added
# sizeFactor column
colData(dds_full)
## DataFrame with 6 rows and 3 columns
##             treatment       label sizeFactor
##              <factor> <character>  <numeric>
## SRR15202006  control      control   1.204378
## SRR15202007  control      control   1.032141
## SRR15202008  control      control   1.025126
## SRR15202009  diabetes    diabetes   1.016402
## SRR15202010  diabetes    diabetes   1.027306
## SRR15202011  diabetes    diabetes   0.762996
# Wald test table for `condition` relative to `control` on the `treatment`
# factor, with alpha set to the report's adjusted p-value cutoff
res <- results(
  dds_full,
  alpha = pval_cutoff,
  contrast = c('treatment', condition, control)
)
# Print the head/tail of the results table
res
## log2 fold change (MLE): treatment diabetes vs control 
## Wald test p-value: treatment diabetes vs control 
## DataFrame with 17151 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat    pvalue      padj
##                    <numeric>      <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000051951   2.95090      0.2797033  1.194292  0.234200 0.8148297        NA
## ENSMUSG00000025900   7.97288     -2.1413277  3.306479 -0.647616 0.5172335  0.954317
## ENSMUSG00000033845 601.46353     -0.0187531  0.131829 -0.142253 0.8868800  0.997519
## ENSMUSG00000102275  11.37339      0.1271374  0.616188  0.206329 0.8365339  0.995909
## ENSMUSG00000025903 784.45488     -0.3325434  0.136983 -2.427628 0.0151979  0.239833
## ...                      ...            ...       ...       ...       ...       ...
## ENSMUSG00000099876  42.68208       0.204127  0.345933  0.590077 0.5551392  0.959866
## ENSMUSG00000068457 262.27768      -0.402218  0.195007 -2.062587 0.0391519  0.408271
## ENSMUSG00000069045 765.28122      -0.194585  0.123372 -1.577224 0.1147439  0.644345
## ENSMUSG00000101059   4.62892      -0.766518  1.020491 -0.751127 0.4525764        NA
## ENSMUSG00000096768 223.28712      -0.472724  0.694267 -0.680897 0.4959370  0.952825
# Per-column descriptions of the Wald results table
mcols(res)
## DataFrame with 6 rows and 2 columns
##                        type            description
##                 <character>            <character>
## baseMean       intermediate mean of normalized c..
## log2FoldChange      results log2 fold change (ML..
## lfcSE               results standard error: trea..
## stat                results Wald statistic: trea..
## pvalue              results Wald test p-value: t..
## padj                results   BH adjusted p-values
# Counts of up/down genes, outliers and low-count genes; `res` was built with
# alpha = pval_cutoff, and the recorded output reports "adjusted p-value < 0.05"
summary(res)
## 
## out of 17151 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 220, 1.3%
## LFC < 0 (down)     : 166, 0.97%
## outliers [1]       : 5, 0.029%
## low counts [2]     : 2328, 14%
## (mean count < 5)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# Dispersion-estimate diagnostic plot for the fitted model
plotDispEsts(dds_full)

Summary details

# Rows of res_sig_df with a positive fold change (upregulated)
filter(res_sig_df, log2FoldChange > 0)
# Rows of res_sig_df with a negative fold change (downregulated)
filter(res_sig_df, log2FoldChange < 0)
# Outliers: genes whose p-value (and therefore padj) is NA
res[is.na(res$pvalue), ]
## log2 fold change (MLE): treatment diabetes vs control 
## Wald test p-value: treatment diabetes vs control 
## DataFrame with 5 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat    pvalue      padj
##                    <numeric>      <numeric> <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000058207   13.5434       -6.11313   3.35524 -1.821964        NA        NA
## ENSMUSG00000027360   43.9822        2.26098   1.47672  1.531080        NA        NA
## ENSMUSG00000029368  113.3851       -6.56226   2.43612 -2.693731        NA        NA
## ENSMUSG00000030324  111.9962       -1.67520   1.72582 -0.970664        NA        NA
## ENSMUSG00000034837   44.1043       -1.47704   1.66797 -0.885530        NA        NA
# Low counts: a p-value exists but padj is NA
res[!is.na(res$pvalue) & is.na(res$padj), ]
## log2 fold change (MLE): treatment diabetes vs control 
## Wald test p-value: treatment diabetes vs control 
## DataFrame with 2328 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE       stat    pvalue      padj
##                    <numeric>      <numeric> <numeric>  <numeric> <numeric> <numeric>
## ENSMUSG00000051951   2.95090      0.2797033   1.19429  0.2342000 0.8148297        NA
## ENSMUSG00000062588   1.55596     -0.4198212   1.65686 -0.2533830 0.7999723        NA
## ENSMUSG00000097797   2.82657      0.0206645   1.20774  0.0171101 0.9863488        NA
## ENSMUSG00000076135   1.71889     -1.2624674   1.72109 -0.7335278 0.4632366        NA
## ENSMUSG00000079671   2.75022      3.0156568   1.49177  2.0215301 0.0432249        NA
## ...                      ...            ...       ...        ...       ...       ...
## ENSMUSG00000084920   3.39025     -0.1024447   1.16976 -0.0875778  0.930212        NA
## ENSMUSG00000084806   2.03762     -0.2693107   1.41819 -0.1898976  0.849389        NA
## ENSMUSG00000049176   1.85195     -0.0168883   1.45129 -0.0116368  0.990715        NA
## ENSMUSG00000087159   3.24779     -0.2997782   1.22270 -0.2451773  0.806319        NA
## ENSMUSG00000101059   4.62892     -0.7665180   1.02049 -0.7511268  0.452576        NA

Shrunken LFC results

# MA plot of the unshrunken (MLE) fold changes
plotMA(res)

# Shrunken LFC results (apeglm).
# `res = res` makes lfcShrink() reuse the Wald table built above with
# alpha = pval_cutoff. Without it lfcShrink() generates its own results table
# at the default alpha, so independent filtering — and with it padj and the
# "low counts" set — no longer matches `res`. The output recorded in this
# report predates this change; re-knit to refresh it.
res_shrunken <- lfcShrink(
  dds_full,
  coef = str_c('treatment_', condition, '_vs_', control),
  res = res,
  type = 'apeglm'
)
res_shrunken
## log2 fold change (MAP): treatment diabetes vs control 
## Wald test p-value: treatment diabetes vs control 
## DataFrame with 17151 rows and 5 columns
##                     baseMean log2FoldChange     lfcSE    pvalue      padj
##                    <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000051951   2.95090     0.00187171 0.0971667 0.8148297        NA
## ENSMUSG00000025900   7.97288    -0.00158854 0.0974560 0.5172335        NA
## ENSMUSG00000033845 601.46353    -0.00604166 0.0785064 0.8868800  0.995641
## ENSMUSG00000102275  11.37339     0.00313091 0.0963428 0.8365339  0.990912
## ENSMUSG00000025903 784.45488    -0.20689645 0.1565421 0.0151979  0.226348
## ...                      ...            ...       ...       ...       ...
## ENSMUSG00000099876  42.68208     0.01527422 0.0954221 0.5551392  0.944317
## ENSMUSG00000068457 262.27768    -0.12770473 0.1809914 0.0391519  0.385684
## ENSMUSG00000069045 765.28122    -0.09192614 0.1033071 0.1147439  0.618083
## ENSMUSG00000101059   4.62892    -0.00696389 0.0973919 0.4525764        NA
## ENSMUSG00000096768 223.28712    -0.00906695 0.0971539 0.4959370  0.935226
# MA plot of the apeglm-shrunken (MAP) fold changes
plotMA(res_shrunken)

# Per-column descriptions of the shrunken results table
mcols(res_shrunken)
## DataFrame with 5 rows and 2 columns
##                        type            description
##                 <character>            <character>
## baseMean       intermediate mean of normalized c..
## log2FoldChange      results log2 fold change (MA..
## lfcSE               results posterior SD: treatm..
## pvalue              results Wald test p-value: t..
## padj                results   BH adjusted p-values
# Summarise the shrunken table at the report's adjusted p-value cutoff.
# NOTE(review): the recorded output reports "mean count < 8" (3325 low counts)
# here versus "mean count < 5" (2328) for summary(res), so the padj values in
# res_shrunken appear to come from a results table filtered at a different
# alpha than pval_cutoff — confirm how lfcShrink() obtained its results table.
summary(res_shrunken, alpha = pval_cutoff)
## 
## out of 17151 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up)       : 219, 1.3%
## LFC < 0 (down)     : 167, 0.97%
## outliers [1]       : 5, 0.029%
## low counts [2]     : 3325, 19%
## (mean count < 8)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Summary details

# Rows of res_shrunken_sig_df with a positive fold change (upregulated)
filter(res_shrunken_sig_df, log2FoldChange > 0)
# Rows of res_shrunken_sig_df with a negative fold change (downregulated)
filter(res_shrunken_sig_df, log2FoldChange < 0)
# Outliers: genes whose p-value (and therefore padj) is NA
res_shrunken[is.na(res_shrunken$pvalue), ]
## log2 fold change (MAP): treatment diabetes vs control 
## Wald test p-value: treatment diabetes vs control 
## DataFrame with 5 rows and 5 columns
##                     baseMean log2FoldChange     lfcSE    pvalue      padj
##                    <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000058207   13.5434    -0.00268899 0.0975156        NA        NA
## ENSMUSG00000027360   43.9822     0.00841953 0.0978813        NA        NA
## ENSMUSG00000029368  113.3851    -0.00476900 0.0976345        NA        NA
## ENSMUSG00000030324  111.9962    -0.00479276 0.0975251        NA        NA
## ENSMUSG00000034837   44.1043    -0.00464703 0.0974973        NA        NA
# Low counts: a p-value exists but padj is NA
res_shrunken[!is.na(res_shrunken$pvalue) & is.na(res_shrunken$padj), ]
## log2 fold change (MAP): treatment diabetes vs control 
## Wald test p-value: treatment diabetes vs control 
## DataFrame with 3325 rows and 5 columns
##                     baseMean log2FoldChange     lfcSE    pvalue      padj
##                    <numeric>      <numeric> <numeric> <numeric> <numeric>
## ENSMUSG00000051951   2.95090     0.00187171 0.0971667  0.814830        NA
## ENSMUSG00000025900   7.97288    -0.00158854 0.0974560  0.517234        NA
## ENSMUSG00000062588   1.55596    -0.00146117 0.0973093  0.799972        NA
## ENSMUSG00000102135   6.74677    -0.00634244 0.0970827  0.579525        NA
## ENSMUSG00000103509   6.69317    -0.00234133 0.0969430  0.818834        NA
## ...                      ...            ...       ...       ...       ...
## ENSMUSG00000044583   7.56298    2.14353e-02 0.1002740 0.0729896        NA
## ENSMUSG00000049176   1.85195   -6.65622e-05 0.0972452 0.9907154        NA
## ENSMUSG00000087159   3.24779   -1.86927e-03 0.0971818 0.8063191        NA
## ENSMUSG00000072844   5.19664   -1.02531e-02 0.0978068 0.2827699        NA
## ENSMUSG00000101059   4.62892   -6.96389e-03 0.0973919 0.4525764        NA

Visualizing results

Heatmaps

# Plot normalized counts (z-scores)
# Columns 2:7 are picked by position — presumably the six sample columns;
# confirm against how counts_sig_norm is built. scale = 'row' standardises
# each gene across samples.
pheatmap(counts_sig_norm[2:7],
         color = brewer.pal(8, 'YlOrRd'),
         cluster_rows = TRUE,
         show_rownames = FALSE,
         annotation_col = as.data.frame(colData(dds)) %>% select(label),
         border_color = NA,
         fontsize = 10,
         scale = 'row',
         fontsize_row = 10,
         height = 20)

# Plot log-transformed counts (same positional column selection, no row scaling)
pheatmap(counts_sig_log[2:7],
         color = rev(brewer.pal(8, 'RdYlBu')),
         cluster_rows = TRUE,
         show_rownames = FALSE,
         annotation_col = as.data.frame(colData(dds)) %>% select(label),
         border_color = NA,
         fontsize = 10,
         fontsize_row = 10,
         height = 20)

# Plot log-transformed counts (top 24 DE genes)
# "Top" means the first 24 rows of res_sig_df — presumably already ordered by
# significance; confirm where res_sig_df is created. Rows are labelled with
# the MGI symbol.
pheatmap(
  counts_sig_log %>%
    filter(ensembl_gene_id %in% head(res_sig_df, 24)$ensembl_gene_id) %>%
    select(-ensembl_gene_id) %>%
    column_to_rownames(var = 'mgi_symbol'),
  color = rev(brewer.pal(8, 'RdYlBu')),
  cluster_rows = TRUE,
  show_rownames = TRUE,
  annotation_col = as.data.frame(colData(dds)) %>% select(label),
  fontsize = 10,
  fontsize_row = 10,
  height = 20
)

Volcano plots

# Unshrunken LFC
# Genes are classed as DE-up / DE-down when they pass both the adjusted
# p-value and the absolute LFC cutoffs, otherwise non-DE.
res_df %>%
  # Drop genes that cannot be placed on the plot up front. Filtering on the
  # derived class alone lets rows with NA padj and a small LFC through as
  # 'non-DE', and ggplot then discards them with a warning.
  filter(!is.na(padj), !is.na(log2FoldChange)) %>%
  mutate(
    sig_threshold = if_else(
      padj < pval_cutoff & abs(log2FoldChange) >= lfc_cutoff,
      if_else(log2FoldChange > 0, 'DE-up', 'DE-down'),
      'non-DE'
    )
  ) %>%
  ggplot() +
  geom_point(aes(x = log2FoldChange, y = -log10(padj), colour = sig_threshold)) +
  # Named values pin each class to its colour; an unnamed vector is matched by
  # level order, so colours would shift whenever one class has no genes.
  scale_color_manual(
    values = c('DE-down' = 'blue', 'DE-up' = 'red', 'non-DE' = 'gray')
  ) +
  xlab('log2 fold change') +
  ylab('-log10 adjusted p-value')

# Shrunken LFC
# Same classification and colour mapping, applied to the shrunken results.
res_shrunken_df %>%
  filter(!is.na(padj), !is.na(log2FoldChange)) %>%
  mutate(
    sig_threshold = if_else(
      padj < pval_cutoff & abs(log2FoldChange) >= lfc_cutoff,
      if_else(log2FoldChange > 0, 'DE-up', 'DE-down'),
      'non-DE'
    )
  ) %>%
  ggplot() +
  geom_point(aes(x = log2FoldChange, y = -log10(padj), colour = sig_threshold)) +
  scale_color_manual(
    values = c('DE-down' = 'blue', 'DE-up' = 'red', 'non-DE' = 'gray')
  ) +
  xlab('log2 fold change') +
  ylab('-log10 adjusted p-value')

GSEA (all)

Hallmark genesets

# Genes ranked by shrunken LFC
plot_enrichment_table(get_fgsea_res(rank_lfc, mm_h), rank_lfc, mm_h)

# Genes ranked by Wald statistic
plot_enrichment_table(get_fgsea_res(rank_stat, mm_h), rank_stat, mm_h)

# Genes ranked by sign(LFC) * -log10(pvalue)
plot_enrichment_table(get_fgsea_res(rank_pval, mm_h), rank_pval, mm_h)

GO biological process

# Genes ranked by shrunken LFC
plot_enrichment_table(get_fgsea_res(rank_lfc, mm_c5_bp), rank_lfc, mm_c5_bp)

# Genes ranked by Wald statistic
plot_enrichment_table(get_fgsea_res(rank_stat, mm_c5_bp), rank_stat, mm_c5_bp)

# Genes ranked by sign(LFC) * -log10(pvalue)
plot_enrichment_table(get_fgsea_res(rank_pval, mm_c5_bp), rank_pval, mm_c5_bp)

GO cellular component

# Genes ranked by shrunken LFC
plot_enrichment_table(get_fgsea_res(rank_lfc, mm_c5_cc), rank_lfc, mm_c5_cc)

# Genes ranked by Wald statistic
plot_enrichment_table(get_fgsea_res(rank_stat, mm_c5_cc), rank_stat, mm_c5_cc)

# Genes ranked by sign(LFC) * -log10(pvalue)
plot_enrichment_table(get_fgsea_res(rank_pval, mm_c5_cc), rank_pval, mm_c5_cc)

GO molecular function

# Genes ranked by shrunken LFC
plot_enrichment_table(get_fgsea_res(rank_lfc, mm_c5_mf), rank_lfc, mm_c5_mf)

# Genes ranked by Wald statistic
plot_enrichment_table(get_fgsea_res(rank_stat, mm_c5_mf), rank_stat, mm_c5_mf)

# Genes ranked by sign(LFC) * -log10(pvalue)
plot_enrichment_table(get_fgsea_res(rank_pval, mm_c5_mf), rank_pval, mm_c5_mf)

GSEA (DE)

Hallmark genesets

# NOTE(review): these calls use the same rank vector names as the
# "GSEA (all)" section — presumably reassigned to DE-only ranks in a chunk
# not shown in this report; confirm.
# Genes ranked by shrunken LFC
plot_enrichment_table(get_fgsea_res(rank_lfc, mm_h), rank_lfc, mm_h)

# Genes ranked by Wald statistic
plot_enrichment_table(get_fgsea_res(rank_stat, mm_h), rank_stat, mm_h)

# Genes ranked by sign(LFC) * -log10(pvalue)
plot_enrichment_table(get_fgsea_res(rank_pval, mm_h), rank_pval, mm_h)

GO biological process

# Genes ranked by shrunken LFC
plot_enrichment_table(get_fgsea_res(rank_lfc, mm_c5_bp), rank_lfc, mm_c5_bp)

# Genes ranked by Wald statistic
plot_enrichment_table(get_fgsea_res(rank_stat, mm_c5_bp), rank_stat, mm_c5_bp)

# Genes ranked by sign(LFC) * -log10(pvalue)
plot_enrichment_table(get_fgsea_res(rank_pval, mm_c5_bp), rank_pval, mm_c5_bp)

GO cellular component

# Genes ranked by shrunken LFC
plot_enrichment_table(get_fgsea_res(rank_lfc, mm_c5_cc), rank_lfc, mm_c5_cc)

# Genes ranked by Wald statistic
plot_enrichment_table(get_fgsea_res(rank_stat, mm_c5_cc), rank_stat, mm_c5_cc)

# Genes ranked by sign(LFC) * -log10(pvalue)
plot_enrichment_table(get_fgsea_res(rank_pval, mm_c5_cc), rank_pval, mm_c5_cc)

GO molecular function

# Genes ranked by shrunken LFC
plot_enrichment_table(get_fgsea_res(rank_lfc, mm_c5_mf), rank_lfc, mm_c5_mf)

# Genes ranked by Wald statistic
plot_enrichment_table(get_fgsea_res(rank_stat, mm_c5_mf), rank_stat, mm_c5_mf)

# Genes ranked by sign(LFC) * -log10(pvalue)
plot_enrichment_table(get_fgsea_res(rank_pval, mm_c5_mf), rank_pval, mm_c5_mf)

System info

# Record the R version, platform and package versions used for this report
sessionInfo()
## R version 4.1.0 (2021-05-18)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: CentOS Linux 7 (Core)
## 
## Matrix products: default
## BLAS/LAPACK: /home/chan/mRNA_seq_pipeline/.snakemake/conda/9a19315a020c824d12f8055f7c009b0f/lib/libopenblasp-r0.3.18.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=en_US.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] fgsea_1.20.0                RColorBrewer_1.1-2          pheatmap_1.0.12             DESeq2_1.34.0               SummarizedExperiment_1.24.0 Biobase_2.54.0              MatrixGenerics_1.6.0        matrixStats_0.61.0          GenomicRanges_1.46.0        GenomeInfoDb_1.30.0         IRanges_2.28.0              S4Vectors_0.32.0            BiocGenerics_0.40.0         scales_1.1.1                forcats_0.5.1               stringr_1.4.0               dplyr_1.0.7                 purrr_0.3.4                 readr_2.1.1                 tidyr_1.1.4                 tibble_3.1.6                ggplot2_3.3.5               tidyverse_1.3.1            
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-2       ellipsis_0.3.2         XVector_0.34.0         fs_1.5.1               rstudioapi_0.13        farver_2.1.0           bit64_4.0.5            mvtnorm_1.1-3          AnnotationDbi_1.56.1   fansi_0.4.2            apeglm_1.16.0          lubridate_1.8.0        xml2_1.3.3             splines_4.1.0          cachem_1.0.6           geneplotter_1.72.0     knitr_1.35             jsonlite_1.7.2         broom_0.7.10           annotate_1.72.0        dbplyr_2.1.1           png_0.1-7              compiler_4.1.0         httr_1.4.2             backports_1.4.0        assertthat_0.2.1       Matrix_1.3-4           fastmap_1.1.0          cli_3.1.0              htmltools_0.5.2        tools_4.1.0            coda_0.19-4            gtable_0.3.0           glue_1.5.1             GenomeInfoDbData_1.2.7 fastmatch_1.1-3        Rcpp_1.0.7             bbmle_1.0.24           cellranger_1.1.0       jquerylib_0.1.4        vctrs_0.3.8            Biostrings_2.62.0      xfun_0.28              rvest_1.0.2            lifecycle_1.0.1        XML_3.99-0.8           MASS_7.3-54            zlibbioc_1.40.0        vroom_1.5.7            hms_1.1.1              parallel_4.1.0         yaml_2.2.1             memoise_2.0.1          gridExtra_2.3          emdbook_1.3.12         bdsmatrix_1.3-4        stringi_1.7.6          RSQLite_2.2.8          highr_0.9              genefilter_1.76.0      BiocParallel_1.28.0    rlang_0.4.12           pkgconfig_2.0.3        bitops_1.0-7           evaluate_0.14          lattice_0.20-45        labeling_0.4.2         bit_4.0.4              tidyselect_1.1.1       plyr_1.8.6             magrittr_2.0.1         R6_2.5.1               generics_0.1.1         DelayedArray_0.20.0    DBI_1.1.1              pillar_1.6.4           haven_2.4.3            withr_2.4.3            survival_3.2-13        KEGGREST_1.34.0        RCurl_1.98-1.5         modelr_0.1.8           crayon_1.4.2           utf8_1.2.2             tzdb_0.2.0             rmarkdown_2.11         
## locfit_1.5-9.4         grid_4.1.0             readxl_1.3.1           data.table_1.14.2      blob_1.2.2             reprex_2.0.1           digest_0.6.29          xtable_1.8-4           numDeriv_2016.8-1.1    munsell_0.5.0